LOAD DATA

df <- readRDS("../data/models/social-risk-crash-rate-data.rds")

HOT-ENCODE YEAR

I will treat year as a factor and one-hot encode it.

df$year <- as.factor(df$year)
year_dummies <- model.matrix(~ year - 1, data = df)

df <- cbind(df[ , !(names(df) %in% c("year"))], year_dummies)

PREPARE TARGET

We will remove all possible target variables and keep only one per model training.

# Choose your target variable (e.g., crash rate per 1,000 residents)
target_var <- "crash_rate_per_1000"

# Remove all target variables except selected
cols_to_remove <- grep("per_1000", 
                       names(df), 
                       value = TRUE)
cols_to_remove <- setdiff(cols_to_remove, target_var) # keep this column

df <- df %>% select(-all_of(cols_to_remove),)

# Create feature matrix and target vector
X <- df %>% select(-target_var, -borough, -total_population, -geoid)
y <- df[[target_var]]

glimpse(X)
## Rows: 13,518
## Columns: 44
## $ pct_male_population              <dbl> 12.460865, 11.694881, 12.229106, 13.5…
## $ pct_white_population             <dbl> 4.1225202, 3.6815086, 2.5856754, 2.36…
## $ pct_black_population             <dbl> 2.435429, 2.630197, 2.833673, 2.41262…
## $ pct_asian_population             <dbl> 0.19803330, 0.14388387, 0.23376782, 0…
## $ pct_hispanic_population          <dbl> 6.411328, 6.607147, 5.982424, 6.07935…
## $ pct_foreign_born                 <dbl> 2.909566, 2.442189, 2.187254, 2.20042…
## $ pct_age_under_18                 <dbl> 2.130762, 2.643626, 2.333613, 2.38777…
## $ pct_age_18_34                    <dbl> 1.715654, 1.653705, 1.882339, 1.96336…
## $ pct_age_35_64                    <dbl> 3.296112, 3.079115, 3.041014, 3.04350…
## $ pct_age_65_plus                  <dbl> 1.80895804, 1.78607845, 1.56929356, 1…
## $ median_income                    <dbl> 58582.658, 49964.513, 68000.000, 7086…
## $ pct_income_under_25k             <dbl> 0.5445916, 0.5544325, 0.5325841, 0.51…
## $ pct_income_25k_75k               <dbl> 1.0568123, 1.1376418, 0.9533662, 0.87…
## $ pct_below_poverty                <dbl> 1.9574830, 1.9510653, 1.8091597, 1.93…
## $ median_gross_rent                <dbl> 1579.1133, 1524.3577, 1701.0000, 1740…
## $ pct_owner_occupied               <dbl> 1.33318303, 1.35699756, 1.58439699, 1…
## $ pct_renter_occupied              <dbl> 1.2085934, 1.2305140, 1.1518859, 1.20…
## $ pct_less_than_hs                 <dbl> 1.4509748, 1.1951954, 1.1302166, 1.04…
## $ pct_hs_diploma                   <dbl> 1.5576081, 1.4196542, 1.2704773, 1.38…
## $ pct_some_college                 <dbl> 0.8473540, 0.8997538, 0.7256966, 0.93…
## $ pct_associates_degree            <dbl> 0.4912749, 0.3280552, 0.3923234, 0.32…
## $ pct_bachelors_degree             <dbl> 0.9482748, 0.9457966, 0.8537607, 0.90…
## $ pct_graduate_degree              <dbl> 0.3998749, 0.7098271, 1.1037907, 0.96…
## $ pct_in_labor_force               <dbl> 3.566504, 3.430191, 3.555304, 3.59599…
## $ pct_not_in_labor_force           <dbl> 3.448445, 3.326595, 3.162980, 3.10658…
## $ unemployment_rate                <dbl> 15.750133, 13.478747, 10.806175, 11.5…
## $ pct_commute_short                <dbl> 0.7350082, 0.4604284, 0.1585556, 0.41…
## $ pct_commute_medium               <dbl> 1.7061331, 1.6882374, 1.2521824, 1.25…
## $ pct_commute_long                 <dbl> 2.477320, 2.685832, 3.553271, 3.73746…
## $ pct_carpool                      <dbl> 0.35798327, 0.31462606, 0.00000000, 0…
## $ pct_public_transit               <dbl> 2.056500, 2.540030, 2.898721, 2.36674…
## $ pct_walk                         <dbl> 0.37702494, 0.30695226, 0.13009688, 0…
## $ pct_bike                         <dbl> 0.00000000, 0.00000000, 0.00000000, 0…
## $ pct_work_from_home               <dbl> 0.01713750, 0.04604284, 0.04675356, 0…
## $ pct_vehicle                      <dbl> 2.0165122, 1.9222885, 2.1120415, 2.03…
## $ post_pandemic                    <int> 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0…
## $ poverty_vehicle_interaction      <dbl> 3.9472883, 3.7505104, 3.8210202, 3.94…
## $ unemployment_vehicle_interaction <dbl> 31.760336, 25.910041, 22.823090, 23.4…
## $ year2018                         <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1…
## $ year2019                         <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ year2020                         <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ year2021                         <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ year2022                         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0…
## $ year2023                         <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0…

HYPERPARAMETER TUNING

What This Does - Optuna (Python) handles the search space and Bayesian optimization. - The final best parameters are applied to fit the final_model. - Search space: Instead of predefined grids, trial$suggest_float() and trial$suggest_int() explore a range of values. - Best parameters: study$best_params holds the optimal hyperparameters.

## CONVERT TO DMATRIX
dtrain_all <- xgb.DMatrix(data = as.matrix(X), label = y)

## Start Python venv
reticulate::use_virtualenv("r-reticulate", required = TRUE)

## OPTUNA-BASED SPATIAL CV
optuna <- import("optuna")

boroughs <- unique(df$borough)
folds <- lapply(boroughs, function(b) which(df$borough != b))

# Optuna objective
objective <- function(trial) {
  params <- list(
    booster = "gbtree",
    eta = trial$suggest_float("eta", 0.01, 0.3, log = TRUE),
    max_depth = trial$suggest_int("max_depth", 3, 12),
    min_child_weight = trial$suggest_int("min_child_weight", 1, 10),
    subsample = trial$suggest_float("subsample", 0.5, 1.0),
    colsample_bytree = trial$suggest_float("colsample_bytree", 0.5, 1.0),
    gamma = trial$suggest_float("gamma", 0, 10),
    lambda = trial$suggest_float("lambda", 0, 10),
    alpha = trial$suggest_float("alpha", 0, 10)
  )
  
  rmse_scores <- numeric(length(folds))
  for (i in seq_along(folds)) {
    train_idx <- folds[[i]]
    valid_idx <- setdiff(seq_len(nrow(dtrain_all)), train_idx)

    dtrain <- xgb.DMatrix(data = as.matrix(X[train_idx, ]), label = y[train_idx])
    dvalid <- xgb.DMatrix(data = as.matrix(X[valid_idx, ]), label = y[valid_idx])

    model <- xgb.train(
      params = params,
      data = dtrain,
      nrounds = 500,
      watchlist = list(val = dvalid),
      early_stopping_rounds = 20,
      verbose = 0
    )

    rmse_scores[i] <- min(model$evaluation_log$val_rmse)
  }
  
  preds <- predict(model, as.matrix(X[valid_idx, ]))
  return(Metrics::rmse(y[valid_idx], preds))
}

# Run Optuna study
set.seed(2025)
study <- optuna$create_study(direction = "minimize")
study$optimize(objective, n_trials = 50)

best_params <- study$best_params
print(best_params)

$eta [1] 0.2309768

$max_depth [1] 4

$min_child_weight [1] 3

$subsample [1] 0.5583975

$colsample_bytree [1] 0.8797697

$gamma [1] 3.35839

$lambda [1] 6.131609

$alpha [1] 2.805485

TRAIN/TEST SPLIT

# Set seed
set.seed(2025)

# Split by index
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
y_train <- y[train_index]
X_test <- X[-train_index, ]
y_test <- y[-train_index]

# Convert to xgb.DMatrix
dtrain <- xgb.DMatrix(data = as.matrix(X_train), label = y_train)
dtest <- xgb.DMatrix(data = as.matrix(X_test), label = y_test)

TRAIN MODEL

# Set seed
set.seed(2025)

# Training with parallel processing
final_model <- xgb.train(
  params = list(
    eta = best_params$eta,
    max_depth = best_params$max_depth,
    gamma = best_params$gamma,
    colsample_bytree = best_params$colsample_bytree,
    min_child_weight = best_params$min_child_weight,
    subsample = best_params$subsample,
    objective = "reg:squarederror",
    eval_metric = "rmse"
  ),
  data = dtrain,
  nrounds = 1000,
  watchlist = list(train = dtrain, test = dtest),
  early_stopping_rounds = 20,
  verbose = 1,
  nthread = detectCores() - 1
)

[78] train-rmse:1.220267 test-rmse:1.555561 [79] train-rmse:1.211549 test-rmse:1.556347 [80] train-rmse:1.207152 test-rmse:1.553078 [81] train-rmse:1.205943 test-rmse:1.553145 [82] train-rmse:1.205311 test-rmse:1.552420 [83] train-rmse:1.201741 test-rmse:1.555835 [84] train-rmse:1.197931 test-rmse:1.553410 [85] train-rmse:1.191889 test-rmse:1.551760 [86] train-rmse:1.189975 test-rmse:1.551783 [87] train-rmse:1.188499 test-rmse:1.550743 Stopping. Best iteration: [67] train-rmse:1.258358 test-rmse:1.549545 ## SAVE MODEL

# Create directory if it doesn't exist
if (!dir.exists("../data/models")) {
  dir.create("../data/models", recursive = TRUE)
}

# Save the final XGBoost model
saveRDS(final_model, file = "../data/models/xgb_model.rds")

# Save the best parameters
saveRDS(best_params, file = "../data/models/xgb_best_params.rds")

cat("Model and parameters saved to ../data/models/")
# Load the final XGBoost model
final_model <- readRDS("../data/models/xgb_model.rds")

# Load the best parameters
best_params <- readRDS("../data/models/xgb_best_params.rds")

MODEL EVALUATION

library(Metrics)
library(ggplot2)
library(dplyr)

set.seed(2025)

# Predict on test set
preds <- predict(final_model, as.matrix(X_test))

# --- Metrics ---
rmse <- sqrt(mean((y_test - preds)^2))
mae <- mean(abs(y_test - preds))
mape <- mean(abs((y_test - preds) / y_test)) * 100
r2 <- 1 - (sum((y_test - preds)^2) / sum((y_test - mean(y_test))^2))

cat("Model Evaluation Metrics:\n")
## Model Evaluation Metrics:
cat("  RMSE:", rmse, "\n")
##   RMSE: 1.549545
cat("  MAE :", mae, "\n")
##   MAE : 0.8372956
cat("  MAPE:", mape, "%\n")
##   MAPE: Inf %
cat("  R²  :", r2, "\n\n")
##   R²  : 0.2595503
# --- Residuals ---
residuals <- y_test - preds
residual_df <- data.frame(
  actual = y_test,
  predicted = preds,
  residuals = residuals
)

# --- Plot: Predicted vs Actual ---
p1 <- residual_df %>%
  ggplot(aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  theme_minimal() +
  labs(title = "Predicted vs Actual Crash Rates",
       x = "Actual",
       y = "Predicted")

# --- Plot: Residuals vs Predicted ---
p2 <- residual_df %>%
  ggplot(aes(x = predicted, y = residuals)) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(title = "Residuals vs Predicted",
       x = "Predicted",
       y = "Residuals")

# --- Plot: Residual Density ---
# Residual Histogram
p3 <- ggplot(residual_df, aes(x = residuals)) +
    geom_histogram(binwidth = 0.2, fill = "steelblue", color = "white") +
    geom_density(color = "red") +
    theme_minimal() +
    labs(title = "Residual Distribution", x = "Residuals", y = "Count")

# Print plots
print(p1)

print(p2)

print(p3)

ggsave("../report/plots/predicted_vs_actual_values_plot.png", p1, width = 10, height = 6, dpi = 300)
ggsave("../report/plots/resisuals_vs_predicted_values_plot.png", p2, width = 10, height = 6, dpi = 300)
ggsave("../report/plots/residual_density_plot.png", p3, width = 10, height = 6, dpi = 300)

Model Evaluation Metrics: RMSE: 1.549545 MAE : 0.8372956 MAPE: Inf % R² : 0.2595503 ## SHAP EXPLAINABILITY

# Compute SHAP values
shap_values <- shap.values(xgb_model = final_model, X_train = as.matrix(X_train))
shap_long <- shap.prep(shap_contrib = shap_values$shap_score, X_train = as.matrix(X_train))

# SHAP summary plot
print(shap.plot.summary(shap_long))

if (!dir.exists("../report/plots")) {
  dir.create("../report/plots")
}

shap <- shap.plot.summary(shap_long)
ggsave("../report/plots/shap_summary_plot.png", shap, width = 10, height = 6, dpi = 300)

INSPECT TREE ENSEMBLE

xgb.plot.tree(model = final_model, trees = 0)
xgb.plot.tree(model = final_model, trees = 1)
xgb.plot.tree(model = final_model, trees = 2)
xgb.plot.multi.trees(model = final_model)
# ============================================================
# Additional Model Diagnostics and Deeper Analysis
# ============================================================

library(ggplot2)
library(dplyr)
library(pdp)        # For Partial Dependence Plots
library(DALEX)      # For model explainability
library(ggthemes)
library(sf)

# ---------------------------
# 1. SHAP Dependence and Interaction Plots
# ---------------------------
message("\nGenerating SHAP dependence and interaction plots...")

# Assuming shap_values and shap_long are already computed
# (if not, recompute them using iml or SHAPforxgboost packages)

# Top feature by SHAP importance
top_feature <- shap_long %>%
  as_tibble() %>%
  count(variable, wt = abs(value), sort = TRUE) %>%
  dplyr::slice(1) %>%
  pull(variable)

# Dependence plot for top feature
shap.plot.dependence(data_long = shap_long, x = top_feature, color_feature = top_feature)

# Interaction values
shap_interaction_values <- predict(
  final_model,
  as.matrix(X_train),
  predinteraction = TRUE
)

# shap_interaction_values will be a 3D array: [n_samples, n_features, n_features]
interactions <- dim(shap_interaction_values)
saveRDS(interactions, "../data/models/shap_interactions.rds")
# ---------------------------
# 3. Partial Dependence Plots (PDP)
# ---------------------------
message("\nGenerating Partial Dependence Plots...")

top_features <- shap_long %>%
  count(variable, wt = abs(value), sort = TRUE) %>%
  dplyr::slice(1:10) %>%
  pull(variable)

# Ensure the output directory exists
dir.create("../report/plots/pdp", recursive = TRUE, showWarnings = FALSE)

for (f in top_features) {
  pd <- partial(final_model, pred.var = f, train = as.matrix(X_train), grid.resolution = 30)
  p <- plot(pd, main = paste("Partial Dependence of", f))
  
  # Save as PNG
  ggsave(
    filename = paste0("../report/plots/pdp/pdp_", f, ".png"),
    plot = p,
    width = 8,
    height = 6,
    dpi = 300
  )
}